
# This file mainly presents how to get Pathway 2's related analysis results

# Results can be obtained in this file include:

    ## Appendix 4.1: Logistic Regression Results (Pathway 2)
    ## Table 5: Predictive Accuracy Results  (Pathway 2)
    ## Table 8: Variance Inflation Factor Values of Variables (Pathway 2 part)
    ## Table 9: Models’ performance metrics (Pathway 2 part)

## R version: 4.4.2

#-----------------Prepare the libraries and the dataset------------------------#

rm(list = ls())  

library(readxl)  
library(tidyverse)  
library(fixest)        
library(lme4)          
library(pROC)          
library(ResourceSelection)
library(car)

options(scipen = 100)  

# Input the dataset 

PSIdata <- read_excel("~/Desktop/PSI.xlsx")  

#--------------Appendix 4.1: Logistic Regression Results (Pathway 2)-----------#

#1) Select the synthetic data---------------------------------------------------
# text_type==0 means UNPSA text; text_type==1 means synthetic (AI-generated) text

PSIdata1 <- subset(PSIdata, text_type==1)
PSIdata1 <- PSIdata1 %>%  
  mutate(  
    Country = as.factor(Country),  
    Year = as.factor(as.character(Year))
  )  

# 2) Create train/test splits --------------------------------------------------  

train_data5 <- PSIdata1 %>% filter(Year %in% c("2018", "2019"))  
test_data5  <- PSIdata1 %>% filter(Year %in% c("2020", "2021", "2022"))  

train_data6 <- PSIdata1 %>% filter(Year %in% c("2018", "2019", "2020"))  
test_data6  <- PSIdata1 %>% filter(Year %in% c("2021", "2022"))  

train_data7 <- PSIdata1 %>% filter(Year %in% c("2018", "2019", "2020", "2021"))  
test_data7  <- PSIdata1 %>% filter(Year == "2022")  


# predictor vector  

predictors <- c("innovation", "impact", "replicability",  
                "subjectivity", "numeric", "emotionality", "certainty","wordcount")  
pred_formula_rhs <- paste(predictors, collapse = " + ")  



# 3) Regression models------------------------------------------------

fit_glmer <- function(train_df, model_name) {  
  re_terms <- c()  
  if (n_distinct(train_df$Country) > 1) re_terms <- c(re_terms, "(1|Country)")  
  if (n_distinct(train_df$Year) > 1)    re_terms <- c(re_terms, "(1|Year)")  
  re_part <- if (length(re_terms) > 0) paste("+", paste(re_terms, collapse = " + ")) else ""  
  fmla <- as.formula(paste0("win ~ ", pred_formula_rhs, " ", re_part))  
  message("Fitting GLMM for ", model_name, ": ", deparse(fmla))  
  glmer(fmla, data = train_df, family = binomial(link = "logit"),  
        control = glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e5)))  
}  

# Fit all four GLMM models
model5 <- fit_glmer(train_data5, "Model 5")  
model6 <- fit_glmer(train_data6, "Model 6")  
model7 <- fit_glmer(train_data7, "Model 7")  
 

# Extract coefficients from GLMM models
extract_glmm_coefficients <- function(model, model_name) {
  coef_summary <- summary(model)$coefficients
  coef_df <- as.data.frame(coef_summary)
  coef_df$Variable <- rownames(coef_df)
  coef_df$Model <- model_name
  coef_df <- coef_df[, c("Model", "Variable", "Estimate", "z value", "Pr(>|z|)")]
  names(coef_df) <- c("Model", "Variable", "Coefficient", "z_value", "p_value")
  
  coef_df$Significance <- ifelse(coef_df$p_value < 0.001, "***",
                                 ifelse(coef_df$p_value < 0.01, "**",
                                        ifelse(coef_df$p_value < 0.05, "*", "")))
  return(coef_df)
}



# 4) Output results-------------------------------------------------------------

# Model 5 regression results---------------------------------------------------- 
coef_model5 <- extract_glmm_coefficients(model5, "Model 5 (Train: 2018-2019)")
kable(coef_model5[, c("Variable", "Coefficient", "z_value", "p_value", "Significance")],
      digits = 3,
      row.names = FALSE,
      caption = "MODEL 5 (Synthetic Winner of 2020-2022)")
cat("N (Train) =", nrow(train_data5), "observations\n")
cat("N (Test) =", nrow(test_data5), "observations\n\n")


# Model 6 regression results---------------------------------------------------- 
coef_model6 <- extract_glmm_coefficients(model6, "Model 6 (Train: 2018-2020)")
kable(coef_model6[, c("Variable", "Coefficient", "z_value", "p_value", "Significance")],
      digits = 3,
      row.names = FALSE,
      caption = "MODEL 6 (Synthetic Winner of 2021-2022)")
cat("N (Train) =", nrow(train_data6), "observations\n")
cat("N (Test) =", nrow(test_data6), "observations\n\n")


# Model 7 regression results---------------------------------------------------- 
coef_model7 <- extract_glmm_coefficients(model7, "Model 7 (Train: 2018-2021)")
kable(coef_model7[, c("Variable", "Coefficient", "z_value", "p_value", "Significance")],
      digits = 3,
      row.names = FALSE,
      caption = "MODEL 7 (Synthetic Winner of 2022)")
cat("N (Train) =", nrow(train_data7), "observations\n")
cat("N (Test) =", nrow(test_data7), "observations\n\n")


#-------------Table 5: Predictive Accuracy Results  (Pathway 2)---------------#


# 1) calculate the model performance metrics------------------------------------

calculate_pr_auc <- function(actual, probs) {
  if (length(actual) != length(probs)) {
    stop("Not match")
  }
  
  # remove NA value
  valid_idx <- !is.na(actual) & !is.na(probs)
  actual <- actual[valid_idx]
  probs <- probs[valid_idx]
  
  if (sum(actual) == 0) {
    return(0)
  }
  
  # calculate Precision-Recall curve
  pr_curve <- data.frame(
    threshold = seq(0, 1, by = 0.01),
    precision = NA,
    recall = NA
  )
  
  for (i in 1:nrow(pr_curve)) {
    threshold <- pr_curve$threshold[i]
    pred_class <- ifelse(probs >= threshold, 1, 0)
    
    # Calculate the confusion matrix
    tp <- sum(actual == 1 & pred_class == 1)
    tn <- sum(actual == 0 & pred_class == 0)
    fp <- sum(actual == 0 & pred_class == 1)
    fn <- sum(actual == 1 & pred_class == 0)
    
    # calculate Precision and Recall
    precision <- ifelse((tp + fp) > 0, tp / (tp + fp), 1)
    recall <- ifelse((tp + fn) > 0, tp / (tp + fn), 0)
    
    pr_curve$precision[i] <- precision
    pr_curve$recall[i] <- recall
  }
  
  pr_curve <- pr_curve[order(pr_curve$recall, -pr_curve$precision), ]
  pr_curve <- pr_curve[!duplicated(pr_curve$recall), ]
  
  # calculate the PR AUC 
  pr_auc <- 0
  for (i in 2:nrow(pr_curve)) {
    width <- pr_curve$recall[i] - pr_curve$recall[i-1]
    height <- (pr_curve$precision[i] + pr_curve$precision[i-1]) / 2
    pr_auc <- pr_auc + width * height
  }
  
  return(pr_auc)
}

calculate_metrics <- function(actual, predicted, probs) {
  conf_matrix <- table(actual, predicted)
  accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
  recall <- conf_matrix[2,2] / sum(conf_matrix[2,])
  specificity <- conf_matrix[1,1] / sum(conf_matrix[1,])
  precision <- conf_matrix[2,2] / sum(conf_matrix[,2])
  f1_score <- 2 * (precision * recall) / (precision + recall)
  
  # ROC AUC
  roc_obj <- roc(actual, probs)
  auc_val <- auc(roc_obj)
  
  # PR AUC
  pr_auc_val <- calculate_pr_auc(actual, probs)
  
  return(list(
    confusion_matrix = conf_matrix,
    accuracy = accuracy,
    recall = recall,
    specificity = specificity,
    precision = precision,
    f1_score = f1_score,
    auc = auc_val,
    pr_auc = pr_auc_val
  ))
}

pred_prob5 <- predict(model5, newdata = test_data5, type = "response", allow.new.levels = TRUE)  
pred_prob6 <- predict(model6, newdata = test_data6, type = "response", allow.new.levels = TRUE)  
pred_prob7 <- predict(model7, newdata = test_data7, type = "response", allow.new.levels = TRUE)  
 

pred_class5 <- ifelse(pred_prob5 >= 0.5, 1, 0)  
pred_class6 <- ifelse(pred_prob6 >= 0.5, 1, 0)  
pred_class7 <- ifelse(pred_prob7 >= 0.5, 1, 0)  
 

# 2) output the predictive accuracy results (in Table 5)------------------------

metrics5 <- calculate_metrics(test_data5$win, pred_class5, pred_prob5)
metrics6 <- calculate_metrics(test_data6$win, pred_class6, pred_prob6)
metrics7 <- calculate_metrics(test_data7$win, pred_class7, pred_prob7)


# Predictive Accuracy Results of Model 5----------------------------------------
print(metrics5) 

# Predictive Accuracy Results of Model 6----------------------------------------
print(metrics6) 

# Predictive Accuracy Results of Model 7----------------------------------------
print(metrics7) 
 

#---------Table 8: Variance Inflation Factor Values of Variables--------------#
# Models 5, 6, 7

round(vif(model5),3)
round(vif(model6),3)
round(vif(model7),3)
 


#-------------------------Table 9: Models’ performance metrics----------------#
# Models 5, 6, 7 part

# 1) construct the performance table--------------------------------------------

performance_table <- data.frame(
  Model = c("Model 5", "Model 6", "Model 7"),
  Precision = c(metrics5$precision, metrics6$precision, metrics7$precision),
  Recall = c(metrics5$recall, metrics6$recall, metrics7$recall),
  F1_Score = c(metrics5$f1_score, metrics6$f1_score, metrics7$f1_score),
  PR_AUC = c(metrics5$pr_auc, metrics6$pr_auc, metrics7$pr_auc)
)

performance_table[, -1] <- round(performance_table[, -1], 3)

# 2) Output the performance results---------------------------------------------

cat("\nPerformance Metrics Table:\n")
print(performance_table, row.names = FALSE)


